## load in packages
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
from pandas import Series
from pandas import DataFrame
from pandas import concat
import numpy as np
from numpy import percentile
from collections import defaultdict
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import datetime
from math import sqrt
import itertools
## import keras related packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
## import packages for SARIMA
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
## import data visualization packages: matplotlib,seaborn,plotly
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import pyplot
import seaborn as sns
sns.set(color_codes=True)
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
from plotly import tools
import plotly.express as px
## import formating packages
import latex
import warnings
warnings.filterwarnings('ignore')
## import data quality pre-check package
import pandas_profiling
## read in the CTA ridership data (expects CTA.csv in the working directory)
data = pd.read_csv("CTA.csv")
## understand shape of data: (rows, columns)
data.shape
## Applied pandas profiling to get a general sense of the data
## NOTE(review): left commented out — presumably run once to generate
## CTA_report.html, then disabled because profiling the full set is costly
#data.profile_report(title='CTA Ridership Dataset')
#profile = data.profile_report(title='CTA Ridership Dataset')
#profile.to_file(output_file='CTA_report.html')
#profile
# define function to conduct data quality check, inputs are df, threshold and col
# df: dataframe you want to check data quality for;
# threshold: minimum fraction of NON-missing values a column needs to be kept,
#            must be a float in [0, 1] (e.g. 0.3 drops columns that are more
#            than 70% missing — the original comment claiming ">30% missing"
#            did not match the dropna(thresh=...) semantics);
# col: numeric column that you want to transform to object
def data_quality_check(df, threshold, col):
    """Clean a ridership dataframe: drop sparse columns and duplicate rows,
    cast an identifier column to object, and sort by a parsed date column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a parseable 'date' column.
    threshold : float in [0, 1]
        Minimum fraction of non-missing values a column must have to be kept.
    col : str
        Name of a numeric identifier column to convert to dtype 'object'.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with an added 'date1' datetime column, sorted by it.
    """
    # keep only columns with at least `threshold` fraction of non-NA values
    df_new = df.dropna(thresh=threshold * len(df), axis=1)
    # remove duplicate records, retaining the first occurrence
    df_new = df_new.drop_duplicates(keep='first')
    # treat the identifier column as categorical/object rather than numeric
    df_new[col] = df_new[col].astype('object')
    ## create a datetime column to help sort data by time
    # (pd.to_datetime is the robust way to parse strings; astype('datetime64[ns]')
    # on object columns is deprecated/removed in modern pandas)
    df_new['date1'] = pd.to_datetime(df_new['date'])
    df_new = df_new.sort_values('date1')
    return df_new
## run the cleaning routine: keep columns >=30% populated, cast station_id
datan=data_quality_check(data,0.3,'station_id')
## calculate total rides per day (sum across all stations per date1)
date_rides=datan.groupby(['date1'])[['rides']].agg('sum').reset_index()
## add daytype info for plot: I want to add daytype to hover text;
## drop_duplicates removes the per-station repetition introduced by the merge
date_rides_plot=pd.merge(date_rides,datan[['date1','daytype']],on='date1',how='inner').drop_duplicates()
## define function to visualize time series data with a Plotly Rangeslider
# x, y: series to plot; name: trace label; title: chart title
# hover_df: optional dataframe with 'date1' and 'daytype' columns used to
#           build the hover text. Defaults to the module-level
#           date_rides_plot for backward compatibility — the original
#           implementation always read that global, which silently decoupled
#           the hover labels from whatever x/y were actually passed in.
def rangeslider(x, y, name, title, hover_df=None):
    if hover_df is None:
        hover_df = date_rides_plot
    fig = go.Figure()
    # one hover label per row so date and daytype show up on mouse-over
    hover_text = [
        ('date: {date1}<br>daytype:{daytype}<br>').format(
            date1=row['date1'], daytype=row['daytype'])
        for _, row in hover_df.iterrows()
    ]
    fig.add_trace(go.Scatter(x=x, y=y, name=name, hovertext=hover_text,
                             line_color='deepskyblue'))
    fig.update_layout(title_text=title,
                      xaxis_rangeslider_visible=True)
    return fig.show()
rangeslider(date_rides_plot.date1,date_rides_plot.rides,"total rides",'Total rides across time (Time Series with Rangeslider)')
## Draw a seaborn countplot for one categorical column of `data`.
# x: column name to count; title: chart title; data: source dataframe
def count_plot(x, title, data):
    # fixed figure size (inches) used for the categorical plots here
    figure, axes = pyplot.subplots(figsize=(8, 5))
    sns.set(style="whitegrid")
    # countplot returns the axes; set_title returns the title Text object,
    # which is what this function hands back to the caller
    return sns.countplot(x=x, data=data, ax=axes).set_title(title, size=20)
count_plot("daytype",'countplot for daytype',datan)
## Draw a seaborn barplot of numeric column y against categorical column x.
# Returns the axes so the caller can add annotations or a title afterwards.
def bar_plot(x, y, data):
    figure, axes = pyplot.subplots(figsize=(8, 5))
    sns.set(style="whitegrid")
    return sns.barplot(x=x, y=y, data=data, ax=axes)
## calculate total rides for each daytype (sorted by daytype label)
daytype_rides=datan.groupby(['daytype'])[['rides']].agg('sum').sort_values(by=['daytype']).reset_index()
## create barplot of rides with different daytype and number tags on it
g = bar_plot("daytype","rides",daytype_rides)
# annotate each bar with its total; row.name is the positional index, which
# doubles as the bar's x coordinate
for index, row in daytype_rides.iterrows():
    g.text(row.name,row.rides,row.rides, color='black', ha="center")
g.set_title('barplot of rides with different daytype',size=20)
plt.show()
## distribution and boxplot of the rides column
# NOTE(review): sns.distplot is deprecated in newer seaborn — histplot/displot
# is the modern replacement
sns.distplot(datan['rides']).set_title('Distribution of rides',size=20)
sns.set(style="whitegrid")
ax = sns.boxplot(x=datan['rides']).set_title('boxplot of rides',size=20)
## count records per day — NOTE(review): the original comment said "total
## rides for each station", which does not match this code: it groups by
## date1 and counts rows (one row per station per day)
station_count=datan.groupby(['date1']).count().reset_index()
## after .count(), 'stationname' holds per-day record counts; .unique()
## lists the distinct counts of reporting stations observed across days
station_count.stationname.unique()
## calculate total rides for each station, largest first
station_rides=datan.groupby(['stationname'])[['rides']].agg('sum').sort_values(by=['rides'],ascending=False).reset_index()
## create barplot of rides with different station_name
## (x tick labels suppressed — there are too many stations to label)
sns.set(rc={'figure.figsize':(15,7)})
g = sns.barplot(x="stationname", y="rides", data=station_rides)
g.set(xticklabels=[],title = 'barplot of stationname')
## filter out top 10 stations with most rides
# BUG FIX: the original used station_rides.iloc[1:11,], which skipped the
# busiest station (positional row 0) — an off-by-one against the stated
# intent; head(10) takes rows 0..9 of the descending-sorted frame
top10 = station_rides.head(10)
## Horizontal barplot of the top-performing stations.
# x, y: columns to plot; data: source frame; col: dimension named in the title
def barplot(x, y, data, col):
    figure, axes = pyplot.subplots(figsize=(8, 5))
    axes = sns.barplot(ax=axes, x=x, y=y, data=data, orient='h')
    # title records which dimension the rides are broken down by
    axes.set_title('# rides Distribution Across {}'.format(col), size=15)
    return plt.show()
barplot('rides','stationname',top10,'stationname')
## Overlay two time series in one Plotly chart with a rangeslider.
## (Redefines — i.e. shadows — the single-series rangeslider defined earlier
## in this notebook; later calls use this two-series signature.)
def rangeslider(x1, y1, name1, x2, y2, name2, title):
    fig = go.Figure()
    # add the two traces in order so the legend matches name1, name2
    for xs, ys, label, colour in ((x1, y1, name1, '#17BECF'),
                                  (x2, y2, name2, '#7F7F7F')):
        fig.add_trace(go.Scatter(x=xs, y=ys, name=label, line_color=colour))
    fig.update_layout(title_text=title,
                      xaxis_rangeslider_visible=True)
    return fig.show()
## pull the daily date/rides series for two example stations to compare
x1 = datan[datan['stationname']=='Lake/State']['date1']
y1 = datan[datan['stationname']=='Lake/State']['rides']
x2 = datan[datan['stationname']=='Chicago/State']['date1']
y2 = datan[datan['stationname']=='Chicago/State']['rides']
rangeslider(x1,y1,'Lake/State',x2,y2,'Chicago/State','station rides across time (Time Series with Rangeslider)')
Seeing a panel data, I first thought of a regression analysis. X Variables to be included can be 'stationname','daytype','time since the first day' and Y variable thus being number of rides. However, with this large amount of data and a few features, a regression model won't be a good choice for production usage and it won't give me a low RMSE. Therefore, I decide to try with other models that can better capture time trend and are more suitable for time series analysis.
Also, if given more features about the rides — e.g. type of ride (car pool, luxury) or location of stations (a map could be created, and I could probably cluster the stations using information like population, which can affect ridership) — I might be able to build other models such as LightGBM.
Here I would like to try models that have a better fit for time series problems.
## train test split for data with total rides information for each day
## (train: years before 2017; test: year 2017)
date_rides_training = date_rides[date_rides['date1'].dt.year<2017]
date_rides_test = date_rides[date_rides['date1'].dt.year==2017]
# Let the p, d and q parameters take the values 0 and 1
# (range(0, 2) yields 0 and 1 — the original comment said "between 0 and 2")
p = d = q = range(0, 2)
# Generate all different combinations of (p, d, q) triplets
pdq = list(itertools.product(p, d, q))
# Generate all seasonal (P, D, Q) triplets with a weekly period (s = 7)
seasonal_pdq = [(x[0], x[1], x[2], 7) for x in list(itertools.product(p, d, q))]
# print a few example combinations to sanity-check the grid
print('Parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
## loop over all combinations of parameters and store each AIC result in a
## dictionary mapping AIC -> [order, seasonal_order]
# NOTE(review): the name `re` shadows the stdlib `re` module (not imported
# in this file, but a risky choice); kept because later cells read it
re = defaultdict(list)
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(date_rides_training['rides'],
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            re[results.aic].extend([param, param_seasonal])
            print('ARIMA{}x{}7 - AIC:{}'.format(param, param_seasonal, results.aic))
        # some parameter combinations fail to converge or are invalid — skip
        # them. FIX: was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit during a long grid search.
        except Exception:
            continue
## fit the model with parameters that led to lowest AIC
## (min(re.keys()) is the smallest AIC; re[...] holds [order, seasonal_order])
## NOTE(review): this refits on the FULL series (date_rides), not just the
## training split used in the grid search — confirm that is intentional
mod = sm.tsa.statespace.SARIMAX(date_rides['rides'],
                                order=re[min(re.keys())][0],
                                seasonal_order=re[min(re.keys())][1],
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
# coefficient table of the fitted model
results.summary().tables[1]
# residual diagnostics (histogram, QQ plot, correlogram)
results.plot_diagnostics(figsize=(15, 12))
plt.show()
## generate predictions for the 2017 test period and plot predicted value
## against actual value
# BUG FIX: the original used `results.predict()[:365]`, which takes the
# FIRST 365 in-sample predictions (i.e. the 2011 start of the series) and
# compares them against 2017 actuals. Predict over the test rows instead so
# the predicted and actual series cover the same dates.
predicted = results.predict(start=date_rides_test.index[0],
                            end=date_rides_test.index[-1])
x1 = date_rides_test['date1']
x2 = date_rides_test['date1']
y1 = predicted
y2 = date_rides_test['rides']
name1 = 'predicted'
name2 = 'actual'
rangeslider(x1,y1,name1,x2,y2,name2,'SARIMA Forecast Result')
## calculate RMSE of the SARIMA predictions over 2017
RMSE = sqrt(mean_squared_error(y2,y1))
print(RMSE)
## extract the daily-total series (date + rides) as the LSTM working frame
ridesLSTM = date_rides[['date1','rides']].reset_index()
In order to prepare the data to feed into the LSTM network, I would like to do the following:
# convert our rides column to a pandas Series (raw values feed the pipeline)
series = pd.Series(ridesLSTM['rides'])
## Run the Augmented Dickey-Fuller stationarity test on a series.
## Returns 1 if the series looks non-stationary (p-value > 0.05, so the
## unit-root null cannot be rejected) and 0 if it is already stationary.
def stationary_test(raw_values):
    result = adfuller(raw_values)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    # p <= 0.05: reject H0 (unit root) -> stationary -> 0; otherwise 1
    return 1 if result[1] > 0.05 else 0
## Return the `interval`-lagged differences of `dataset` as a pandas Series.
## The result has len(dataset) - interval entries (empty if the input is
## shorter than the interval).
def difference(dataset, interval=1):
    deltas = [dataset[idx] - dataset[idx - interval]
              for idx in range(interval, len(dataset))]
    return Series(deltas)
# underlying numpy array of the daily rides series
raw_values = series.values
# transform data to be stationary (first difference) only if the ADF test
# says it is non-stationary; otherwise use the raw values unchanged
if stationary_test(raw_values) == 1:
    diff_values = difference(raw_values, 1)
else:
    diff_values = raw_values
Looking at the ADF test result, I find the p-value < 0.05, so I am confident in rejecting H0, which means the data does not have a unit root and is stationary. (Since stationary data is easier to model and will very likely result in more skillful forecasts, if the data were non-stationary I would transform it to be stationary by taking the first difference.)
The following function takes a NumPy array of the raw time series data and a lag or number of shifted series to create and use as inputs. Here I use 1 as shift so as to use t-1 as the input of t. For the first value since we have no data, I fill it with 0.
# Frame a univariate sequence as a supervised-learning table: the columns
# are the series shifted by 1..lag (the inputs) followed by the unshifted
# series (the target). Leading values with no history are filled with 0.
def timeseries_to_supervised(data, lag=1):
    frame = DataFrame(data)
    shifted = [frame.shift(k) for k in range(1, lag + 1)]
    shifted.append(frame)
    supervised = concat(shifted, axis=1)
    supervised.fillna(0, inplace=True)
    return supervised
# transform data to be supervised learning (lag-1 input, current value target)
supervised = timeseries_to_supervised(diff_values, 1)
supervised_values = supervised.values
print(supervised.head())
# split data into train(2011-2016) and test-sets(2017) using index
# NOTE(review): 5843 and 6209 are hard-coded row positions for the 2016/2017
# boundary and end of data — confirm they still hold if the input changes
train, test = supervised_values[0:5843], supervised_values[5843:6209]
Since the default activation function for LSTMs is the hyperbolic tangent (tanh), which outputs values between -1 and 1, I need to scale my data into that range.
# Scale train and test data to [-1, 1]. The scaler is fitted on the training
# split only, so no information from the test period leaks into the scaling.
# Returns the fitted scaler (needed later to invert predictions) plus the
# two scaled arrays.
def scale(train, test):
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train)
    # the reshape is a no-op for 2-D input but pins the expected layout
    train_scaled = scaler.transform(train.reshape(train.shape[0], train.shape[1]))
    test_scaled = scaler.transform(test.reshape(test.shape[0], test.shape[1]))
    return scaler, train_scaled, test_scaled
# transform the scale of the data
scaler, train_scaled, test_scaled = scale(train, test)
# Undo the MinMax scaling for a single forecasted value: rebuild the full
# feature row (inputs X followed by the prediction), inverse-transform it,
# and return the unscaled prediction (the row's last element).
def invert_scale(scaler, X, value):
    row = np.array(list(X) + [value]).reshape(1, -1)
    unscaled = scaler.inverse_transform(row)
    return unscaled[0, -1]
# Undo differencing for one step: add back the observation `interval` steps
# before the end of `history`.
def inverse_difference(history, yhat, interval=1):
    return history[-interval] + yhat
# Produce a one-step forecast from a fitted LSTM: reshape the input row to
# the [samples, timesteps, features] layout the network expects, predict,
# and return the scalar output.
def forecast_lstm(model, batch_size, X):
    batch = X.reshape(1, 1, len(X))
    prediction = model.predict(batch, batch_size=batch_size, verbose=0)
    return prediction[0, 0]
## Build and train a stateful LSTM on `train`, whose columns are [X | y].
## neurons: LSTM units; nb_epoch: training passes; batch_size must match
## the batch shape used later at prediction time (stateful layer).
def fit_lstm(train, batch_size, nb_epoch, neurons):
    features, target = train[:, 0:-1], train[:, -1]
    # reshape inputs to [samples, timesteps, features] for the LSTM layer
    features = features.reshape(features.shape[0], 1, features.shape[1])
    model = Sequential()
    model.add(LSTM(neurons,
                   batch_input_shape=(batch_size, features.shape[1], features.shape[2]),
                   stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for _ in range(nb_epoch):
        ## shuffle=False keeps the temporal order of samples;
        ## verbose=0 suppresses per-epoch progress output
        model.fit(features, target, epochs=1, batch_size=batch_size,
                  verbose=0, shuffle=False)
        # stateful layer: clear the carried-over state between epochs
        model.reset_states()
    return model
# let's train: batch size 1, 30 epochs, 10 LSTM units
lstm_model = fit_lstm(train_scaled, 1, 30, 10)
predictions = list()
# walk-forward over the test set: one one-step forecast per test row
for i in range(len(test_scaled)):
    # make one-step forecast from the scaled lag-1 input
    X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
    yhat = forecast_lstm(lstm_model, 1, X)
    # invert scaling back to the original units
    yhat = invert_scale(scaler, X, yhat)
    # invert differencing
    # (left disabled because the ADF test showed the series was already
    # stationary, so no differencing was applied upstream)
    #yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)
    # store forecast
    predictions.append(yhat)
# report performance
# NOTE(review): predictions[1:] is compared against rows 5844:6209, i.e. the
# first forecast is dropped and the alignment shifts by one — confirm this
# offset is intentional
rmse = sqrt(mean_squared_error(ridesLSTM['rides'][5844:6209], predictions[1:]))
print( rmse)
Since the data is big and tuning hyperparameters can be time consuming, I would suggest the following hyperparameters to tune using grid search given more time and computation ability.
Other factors to consider: adding more layers. But I don't think it is suitable for this problem.
## prepare data for plotting the LSTM forecast against the actuals
## (rows 5844:6209 are the 2017 test period; predictions[1:] matches the
## same offset used in the RMSE computation)
x1 = ridesLSTM['date1'][5844:6209]
x2 = ridesLSTM['date1'][5844:6209]
y1 = predictions[1:]
y2 = ridesLSTM['rides'][5844:6209]
name1 = 'predicted'
name2 = 'actual'
rangeslider(x1,y1,name1,x2,y2,name2,'LSTM Forecast Result')
## assemble a date / prediction / actual table for 2017 and export to CSV
df1 = pd.DataFrame(ridesLSTM['date1'][5844:6209]).reset_index(drop=True)
df2 = pd.DataFrame(predictions[1:]).reset_index(drop=True)
df3 = pd.DataFrame(ridesLSTM['rides'][5844:6209]).reset_index(drop=True)
pred2017 = pd.concat([df1, df2, df3], axis=1)
# the predictions column gets the default name 0; rename all three columns
pred2017.rename(columns = {'date1':'Date', 0 :'prediction','rides':'actual'}, inplace = True)
pred2017.to_csv("predict2017.csv")
** details have been discussed in the sections above